#! /usr/bin/perl

use strict;
use warnings;

# Print a usage message and exit unless the script is invoked as
# documented: input redirected on stdin and no command-line arguments.
if (-t 0 || @ARGV) {
    # An explicit --help is a successful request, so its text belongs on
    # stdout.  Any other misuse is an error; report it on stderr so that
    # the message cannot end up inside a redirected sys-file-encoding.c.
    my $help = @ARGV && $ARGV[0] eq '--help';
    my $out = $help ? \*STDOUT : \*STDERR;
    print {$out} <<EOF;
$0: generate code page tables from ICU encoding list
usage: $0 < convrtrs.txt > sys-file-encoding.c

To regenerate the encoding data, get the latest ICU encoding data from:
http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
then convert it with this script using the command above.
EOF
    exit ($help ? 0 : 1);
}

# Families of code page names.  The numeric values double as priorities:
# when more than one family supplies an entry, the output code prefers
# the family with the highest value.
our $WINDOWS = 3;		# Windows code pages.
our $IBM = 2;			# IBM code pages.
our $CP = 1;			# Java (?) code pages.
our %sources = ($WINDOWS => "windows", $IBM => "ibm", $CP => "cp");

# Read the converter list from standard input, as the usage message
# documents ("$0 < convrtrs.txt").  A logical entry starts on a line
# with no leading white space and continues over any following lines
# that are indented; "#" starts a comment that runs to end of line.
# Each completed entry is handed to process_converter.
my $converter = "";
while (my $line = <STDIN>) {
    chomp ($line);
    $line =~ s/#.*//;
    if ($line =~ s/^\s+//) {
	# Continuation line: append to the entry being accumulated.
	$converter .= " $line";
    } else {
	# A new entry begins, so the previous one is complete.
	process_converter ($converter);
	$converter = $line;
    }
}
# Flush the final entry, which has no following line to terminate it.
process_converter ($converter);

# Filled in by process_converter: $codepages{NUMBER}{SOURCE} refers to
# the list of encoding names recorded for code page NUMBER under SOURCE
# (one of $WINDOWS, $IBM, $CP).
our %codepages;

# File header and the opening line of the number-to-name table.
print <<'EOF';
/* -*- mode: c; buffer-read-only: t -*-

   Generated by sys-file-encoding.pl.  Do not modify!
*/

#include <config.h>

#include "data/sys-file-private.h"

struct sys_encoding sys_codepage_number_to_name[] = {
EOF

# One row per code page number, in ascending numeric order.  The name
# shown is the first name from the highest-valued source that has an
# entry for that number.
foreach my $number (sort { $a <=> $b } keys %codepages) {
    my $by_source = $codepages{$number};
    my $preferred = max (keys %{$by_source});
    my $names = $by_source->{$preferred};
    printf "  { %s, \"%s\" },\n", $number, $names->[0];
}

# Sentinel row and end of the array.
print "  { 0, NULL }\n};\n\n";

# Invert %codepages: $names{NAME}{SOURCE} lists every code page number
# that SOURCE associates with NAME.  Numbers are visited in ascending
# order, so each list is itself in ascending order.
my %names;
for my $cpnumber (sort { $a <=> $b } (keys (%codepages))) {
    for my $source (keys (%{$codepages{$cpnumber}})) {
	for my $name (@{$codepages{$cpnumber}{$source}}) {
	    push(@{$names{$name}{$source}}, $cpnumber);
	}
    }
}

# Emit the name-to-number table, one row per encoding name, in name
# order.
print "struct sys_encoding sys_codepage_name_to_number[] = {\n";
for my $name (sort (keys (%names))) {
    # Try the sources from highest priority to lowest and use the first
    # one that knows this name.  The keys of %sources are numbers, so
    # compare them numerically; the default string sort would misorder
    # them as soon as any priority reached two digits.
    for my $source (sort { $b <=> $a } (keys (%sources))) {
	next if !exists ($names{$name}{$source});
	my (@numbers) = @{$names{$name}{$source}};

	# The only two encodings that currently print this are KSC_5601
	# and KS_C_5601-1987, for code pages 949 and 51949.  It looks to
	# me like the correct code page number is 949, which is the one
	# chosen (because the numbers are in sorted order).
	print "  /* $name has multiple numbers for $sources{$source}: @numbers */\n"
	  if @numbers > 1;

	print "  { $numbers[0], \"$name\" },\n";
	last;
    }
}
# Sentinel row and end of the array.
print "  { 0, NULL }\n";
print "};\n";

# Parses one logical converter entry and records it in %codepages.
#
# $converter is a white-space-separated list of names, each optionally
# followed by a brace-enclosed list of standards tags, for example
# "NAME1 { TAG1 TAG2* } NAME2 { IANA* } NAME3".  Names without a tag
# list are ignored.  Tagged names are collected into one list ordered
# by preference: names tagged "IANA*", then names tagged "IANA", then
# names carrying any other tag ending in "*", then the remaining tagged
# names.
#
# For every tagged name of the form cpN, windows-N, or ibm-N, the
# number N is remembered under $CP, $WINDOWS, or $IBM respectively (a
# later match for the same source replaces an earlier one).  Finally
# $codepages{N}{SOURCE} is set to a copy of the ordered name list for
# each remembered (SOURCE, N) pair.
#
# Blank entries and entries that begin with "{" are skipped.  The
# return value is not meaningful.
sub process_converter {
    my ($converter) = @_;
    return if $converter =~ /^\s*$/;
    # An entry starting with a brace has no converter name in front of
    # its tag list (presumably the tag declaration near the top of
    # convrtrs.txt -- confirm against the data file).
    return if $converter =~ /^\s*\{/;

    my %cps;
    my @iana;
    my @other;

    my @fields = split (' ', $converter);
    while (@fields) {
	my $name = shift (@fields);

	# Untagged names are completely nonstandard.
	next if !@fields || $fields[0] ne '{';
	shift (@fields);

	# Gather the tags up to the closing brace.  If the input ends
	# before a '}' is seen, shift returns undef; stop there instead
	# of looping forever on an empty field list.
	my (%standards);
	for (;;) {
	    my $standard = shift (@fields);
	    if (!defined ($standard)) {
		warn "$0: missing '}' after tags for $name\n";
		last;
	    }
	    last if $standard eq '}';
	    $standards{$standard} = 1;
	}

	# unshift puts starred (default) names ahead of plain ones
	# within each of the two lists.
	if (exists $standards{'IANA*'}) {
	    unshift (@iana, $name);
	} elsif (exists $standards{'IANA'}) {
	    push (@iana, $name);
	} elsif (grep (/\*$/, keys %standards)) {
	    unshift (@other, $name);
	} else {
	    push (@other, $name);
	}

	# Names that embed a code page number tell us which number(s)
	# this converter corresponds to.  int() drops leading zeros.
	if ($name =~ /^cp([0-9]+)$/) {
	    $cps{$CP} = int ($1);
	} elsif ($name =~ /^windows-([0-9]+)$/) {
	    $cps{$WINDOWS} = int ($1);
	} elsif ($name =~ /^ibm-([0-9]+)$/) {
	    $cps{$IBM} = int ($1);
	}
    }

    # If there are no tagged names then this is completely nonstandard.
    return if !@iana && !@other;

    $codepages{$cps{$_}}{$_} = [@iana, @other] for keys (%cps);
}

# Returns the numerically greatest of the arguments.  With no arguments
# the result is undef.
sub max {
    my $largest;
    foreach my $candidate (@_) {
	# Keep the current champion unless this value beats it; the very
	# first value is always taken.
	next if defined ($largest) && !($candidate > $largest);
	$largest = $candidate;
    }
    return $largest;
}
